## Importing libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
## Reading dataset
# Load the salaries data into a DataFrame and preview the first rows.
df = pd.read_csv("ds_salaries.csv")
df.head()
| work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023 | SE | FT | Principal Data Scientist | 80000 | EUR | 85847 | ES | 100 | ES | L |
| 1 | 2023 | MI | CT | ML Engineer | 30000 | USD | 30000 | US | 100 | US | S |
| 2 | 2023 | MI | CT | ML Engineer | 25500 | USD | 25500 | US | 100 | US | S |
| 3 | 2023 | SE | FT | Data Scientist | 175000 | USD | 175000 | CA | 100 | CA | M |
| 4 | 2023 | SE | FT | Data Scientist | 120000 | USD | 120000 | CA | 100 | CA | M |
df.info()  # column dtypes, non-null counts and memory usage
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3755 entries, 0 to 3754 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work_year 3755 non-null int64 1 experience_level 3755 non-null object 2 employment_type 3755 non-null object 3 job_title 3755 non-null object 4 salary 3755 non-null int64 5 salary_currency 3755 non-null object 6 salary_in_usd 3755 non-null int64 7 employee_residence 3755 non-null object 8 remote_ratio 3755 non-null int64 9 company_location 3755 non-null object 10 company_size 3755 non-null object dtypes: int64(4), object(7) memory usage: 322.8+ KB
df.shape #checking the dimension of dataset: (rows, columns)
(3755, 11)
## Checking missing values
df.isnull().sum()  # per-column count of missing entries (all zero here)
work_year 0 experience_level 0 employment_type 0 job_title 0 salary 0 salary_currency 0 salary_in_usd 0 employee_residence 0 remote_ratio 0 company_location 0 company_size 0 dtype: int64
# Removing unnecessary columns: 'salary_in_usd' already gives pay on a common
# scale, so the raw 'salary' and 'salary_currency' columns are redundant here.
df.drop(['salary','salary_currency'],axis=1,inplace=True)
df.duplicated() ## Checking duplicate rows (True marks a repeat of an earlier row)
0 False
1 False
2 False
3 False
4 False
...
3750 False
3751 False
3752 False
3753 False
3754 False
Length: 3755, dtype: bool
df[df.duplicated()] # Duplicate data output. NOTE(review): duplicates are only displayed, never dropped, so the aggregates below still include them — confirm this is intended
| work_year | experience_level | employment_type | job_title | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|
| 115 | 2023 | SE | FT | Data Scientist | 150000 | US | 0 | US | M |
| 123 | 2023 | SE | FT | Analytics Engineer | 289800 | US | 0 | US | M |
| 153 | 2023 | MI | FT | Data Engineer | 100000 | US | 100 | US | M |
| 154 | 2023 | MI | FT | Data Engineer | 70000 | US | 100 | US | M |
| 160 | 2023 | SE | FT | Data Engineer | 115000 | US | 0 | US | M |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3439 | 2022 | MI | FT | Data Scientist | 78000 | US | 100 | US | M |
| 3440 | 2022 | SE | FT | Data Engineer | 135000 | US | 100 | US | M |
| 3441 | 2022 | SE | FT | Data Engineer | 115000 | US | 100 | US | M |
| 3586 | 2021 | MI | FT | Data Engineer | 200000 | US | 100 | US | L |
| 3709 | 2021 | MI | FT | Data Scientist | 90734 | DE | 50 | DE | L |
1171 rows × 9 columns
df.describe() #Descriptive analysis of the numeric columns (count/mean/std/quartiles/min/max)
| work_year | salary_in_usd | remote_ratio | |
|---|---|---|---|
| count | 3755.000000 | 3755.000000 | 3755.000000 |
| mean | 2022.373635 | 137570.389880 | 46.271638 |
| std | 0.691448 | 63055.625278 | 48.589050 |
| min | 2020.000000 | 5132.000000 | 0.000000 |
| 25% | 2022.000000 | 95000.000000 | 0.000000 |
| 50% | 2022.000000 | 135000.000000 | 0.000000 |
| 75% | 2023.000000 | 175000.000000 | 100.000000 |
| max | 2023.000000 | 450000.000000 | 100.000000 |
df.columns  # remaining columns after dropping 'salary' and 'salary_currency'
Index(['work_year', 'experience_level', 'employment_type', 'job_title',
'salary_in_usd', 'employee_residence', 'remote_ratio',
'company_location', 'company_size'],
dtype='object')
# Number of unique values in each column of the dataframe
for col in df.columns:
    print(f"{col} : {df[col].nunique()}")
work_year : 4 experience_level : 4 employment_type : 4 job_title : 93 salary_in_usd : 1035 employee_residence : 78 remote_ratio : 3 company_location : 72 company_size : 3
#Exploratory Data analysis
df['experience_level'].unique()  # distinct experience-level codes
array(['SE', 'MI', 'EN', 'EX'], dtype=object)
Here we can see there are 4 unique experience levels: SE (Senior), MI (Mid-level), EN (Entry-level) and EX (Executive).
df['employment_type'].unique()  # distinct employment-type codes
array(['FT', 'CT', 'FL', 'PT'], dtype=object)
It has 4 unique employment types: FT (Full-time), CT (Contract), FL (Freelance) and PT (Part-time).
#Top 15 Job title
# Frequency of the fifteen most common job titles (value_counts is
# already sorted descending, so a positional slice takes the top 15).
job_title = df['job_title'].value_counts().iloc[:15]
job_title
Data Engineer 1040 Data Scientist 840 Data Analyst 612 Machine Learning Engineer 289 Analytics Engineer 103 Data Architect 101 Research Scientist 82 Data Science Manager 58 Applied Scientist 58 Research Engineer 37 ML Engineer 34 Data Manager 29 Machine Learning Scientist 26 Data Science Consultant 24 Data Analytics Manager 22 Name: job_title, dtype: int64
# Bar chart of the top-15 job titles. Reuse the `job_title` series computed
# above instead of recomputing value_counts().head(15) once per axis.
x = job_title.index
y = job_title.values
fig = go.Figure(data=go.Bar(x=x, y=y))
fig.update_xaxes(title="Job title")
fig.update_yaxes(title="Frequency")
fig.update_layout(title="Top 15 job Title", width=550,
                  height=600)
fig.show()
df.info()  # re-check the schema after the column drop (now 9 columns)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3755 entries, 0 to 3754 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work_year 3755 non-null int64 1 experience_level 3755 non-null object 2 employment_type 3755 non-null object 3 job_title 3755 non-null object 4 salary_in_usd 3755 non-null int64 5 employee_residence 3755 non-null object 6 remote_ratio 3755 non-null int64 7 company_location 3755 non-null object 8 company_size 3755 non-null object dtypes: int64(3), object(6) memory usage: 264.1+ KB
##Average Salaries Based On Job Titles
# Select the salary column *before* aggregating: `.mean()` over the whole
# frame tries to average non-numeric columns, which raises TypeError in
# pandas >= 2.0 and wastes work on older versions.
df.groupby('job_title')['salary_in_usd'].mean()
job_title
3D Computer Vision Researcher 21352.250000
AI Developer 136666.090909
AI Programmer 55000.000000
AI Scientist 110120.875000
Analytics Engineer 152368.631068
...
Research Engineer 163108.378378
Research Scientist 161214.195122
Software Data Engineer 62510.000000
Staff Data Analyst 15000.000000
Staff Data Scientist 105000.000000
Name: salary_in_usd, Length: 93, dtype: float64
## Maximum Salary getting job role
# Column-first selection avoids aggregating non-numeric columns
# (TypeError in pandas >= 2.0).
df.groupby('job_title')['salary_in_usd'].mean().idxmax()
'Data Science Tech Lead'
# Highest average salary among job titles; select the column before `.mean()`
# so non-numeric columns are not aggregated (TypeError in pandas >= 2.0).
df.groupby('job_title')['salary_in_usd'].mean().max()
375000.0
### Salaries based on Experience level
# Average salary per experience level; select the column before `.mean()`
# so non-numeric columns are not aggregated (TypeError in pandas >= 2.0).
exp = df.groupby('experience_level')['salary_in_usd'].mean()
exp
experience_level EN 78546.284375 EX 194930.929825 MI 104525.939130 SE 153051.071542 Name: salary_in_usd, dtype: float64
## Experience level with the highest average salary (comment was copy-pasted
## from the job-title cell). Column-first selection avoids aggregating
## non-numeric columns (TypeError in pandas >= 2.0).
df.groupby('experience_level')['salary_in_usd'].mean().idxmax()
'EX'
# Highest average salary across experience levels; column-first selection
# avoids aggregating non-numeric columns (TypeError in pandas >= 2.0).
df.groupby('experience_level')['salary_in_usd'].mean().max()
194930.9298245614
# Bar chart of average salary per experience level. Reuse the `exp` series
# computed above instead of recomputing the groupby mean once per axis.
# The axis labels previously said "Job role"/"Frequency", but the x-axis is
# the experience level and the bars are average salaries — fixed here.
X = exp.index
Y = exp.values
fig = go.Figure(data=go.Bar(x=X, y=Y))
fig.update_xaxes(title="Experience level")
fig.update_yaxes(title="Average salary (USD)")
fig.update_layout(title="Job role acc. to experience", width=500,
                  height=500)
fig.show()
##Salaries based on Employment Type
# Mean salary for each employment type; grouping the frame and selecting
# the column yields the same Series as grouping the column by a key Series.
ET = df.groupby('employment_type')['salary_in_usd'].mean()
ET
employment_type CT 113446.900000 FL 51807.800000 FT 138314.199570 PT 39533.705882 Name: salary_in_usd, dtype: float64
# Bar chart of average salary per employment type. Reuse the `ET` series
# computed above instead of recomputing the grouped mean once per axis.
# Fixes the "Emplyement" typo and the misleading "Frequency" label (the
# bars are average salaries).
X1 = ET.index
Y1 = ET.values
fig = go.Figure(data=go.Bar(x=X1, y=Y1))
fig.update_xaxes(title="Employment type")
fig.update_yaxes(title="Average salary (USD)")
fig.update_layout(title="Employment type with salary", width=500,
                  height=500)
fig.show()
##Average Salaries based on Year(2020-2023)
# Mean salary for each work year present in the data.
WY = df.groupby('work_year')['salary_in_usd'].mean()
WY
work_year 2020 92302.631579 2021 94087.208696 2022 133338.620793 2023 149045.541176 Name: salary_in_usd, dtype: float64
# Line chart of average salary per year. Reuse the `WY` series computed
# above; `go.Line` is deprecated in plotly — use go.Scatter(mode="lines").
# The x-axis label was copy-pasted ("Emplyement Type") although the axis
# is the work year, and the y-axis shows average salary, not a frequency.
X2 = WY.index
Y2 = WY.values
fig = go.Figure(data=go.Scatter(x=X2, y=Y2, mode="lines"))
fig.update_xaxes(title="Work year")
fig.update_yaxes(title="Average salary (USD)")
fig.update_layout(title='Salaries based on Year(2020-23)', width=500,
                  height=500)
fig.show()